Loading libraries

library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(corrplot)
library(caret)
library(data.table)

Reading data

removable_columns <- c("title", "pdb_code", "res_id", "chain_id", "local_res_atom_count", "local_res_atom_non_h_occupancy_sum", "local_res_atom_non_h_electron_occupancy_sum", "local_res_atom_C_count", "local_res_atom_N_count", "local_res_atom_O_count", "local_res_atom_S_count", "dict_atom_C_count", "dict_atom_N_count", "dict_atom_O_count", "dict_atom_S_count", "skeleton_data", "skeleton_cycle_4", "skeleton_diameter", "skeleton_cycle_6", "skeleton_cycle_7", "skeleton_closeness_006_008", "skeleton_closeness_002_004", "skeleton_cycle_3", "skeleton_avg_degree", "skeleton_closeness_004_006", "skeleton_closeness_010_012", "skeleton_closeness_012_014", "skeleton_edges", "skeleton_radius", "skeleton_cycle_8_plus", "skeleton_closeness_020_030", "skeleton_deg_5_plus", "skeleton_closeness_016_018", "skeleton_closeness_008_010", "skeleton_closeness_018_020", "skeleton_average_clustering", "skeleton_closeness_040_050", "skeleton_closeness_014_016", "skeleton_center", "skeleton_closeness_000_002", "skeleton_density", "skeleton_closeness_030_040", "skeleton_deg_4", "skeleton_deg_0", "skeleton_deg_1", "skeleton_deg_2", "skeleton_deg_3", "skeleton_graph_clique_number", "skeleton_nodes", "skeleton_cycles", "skeleton_cycle_5", "skeleton_closeness_050_plus", "skeleton_periphery", "fo_col", "fc_col", "weight_col", "grid_space", "solvent_radius", "solvent_opening_radius", "part_step_FoFc_std_min", "part_step_FoFc_std_max", "part_step_FoFc_std_step")

data <- fread("./all_summary.csv", nrows = 10000, header = TRUE, drop = removable_columns)
dim(data)
## [1] 10000   350

Processing missing data

dim(data)
## [1] 10000   350
data <- data %>% 
  drop_na()
dim(data)
## [1] 8958  350

Deleting chosen ligands

deletable_res_name <- c("UNK", "UNX", "UNL", "DUM", "N", "BLOB", "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP", "TYR", "VAL", "DA", "DG", "DT", "DC", "DU", "A", "G", "T", "C", "U", "HOH", "H20", "WAT")
data <- data %>% filter(!res_name %in% deletable_res_name)
dim(data)
## [1] 8910  350

Data summary

statistics <- data %>%
  select(res_name, blob_volume_coverage, blob_volume_coverage_second)
kable(summary(statistics))
res_name blob_volume_coverage blob_volume_coverage_second
Length:8910 Min. :0.02305 Min. :0.00000
Class :character 1st Qu.:0.50648 1st Qu.:0.00000
Mode :character Median :0.72244 Median :0.00000
NA Mean :0.66784 Mean :0.02067
NA 3rd Qu.:0.86480 3rd Qu.:0.00000
NA Max. :1.00000 Max. :0.95385
dim(data)
## [1] 8910  350

Cardinality of ligands by name

plot_ligands <- ggplot(popular_ligands, aes(x = reorder(res_name, -n), y = n, fill = n)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("ligand")+
  labs(title = "Cardinality of ligands by name")

ggplotly(plot_ligands)

Correlation between variables

data %>%
  select_if(is.numeric) %>%
  cor %>%
  corrplot(method = "circle", tl.col = "black", tl.srt = 45)
## Warning in cor(.): odchylenie standardowe wynosi zero

Distribution of atom and electron count

plot_atom <- ggplot(data, aes(x = local_res_atom_non_h_count)) + 
  geom_density(alpha = .3, fill = "#00CECB", color = NA) +
  xlab("atom count") +
  labs(title = "Atom count distribution")

ggplotly(plot_atom)
plot_electron <- ggplot(data, aes(x = local_res_atom_non_h_electron_sum)) + 
  geom_density(alpha = .3, fill = "#FF5E5B", color = NA) +
  xlab("electron count") +
  labs(title = "Electron count distribution")

ggplotly(plot_electron)

Distribution of part_01 columns

plot_part_data <- data %>%
  select(contains("part_01")) %>%
  gather(part, value, 1:106)

dim(plot_part_data)
## [1] 661334      2
# plot_part_data_1 <- plot_part_data[1:118926,]
# plot_part_data_2 <- plot_part_data[118927:237852,]
# plot_part_data_3 <- plot_part_data[237853:356778,]
# plot_part_data_4 <- plot_part_data[356779:475704,]
# plot_part_data_5 <- plot_part_data[475705:594630,]
# plot_part_data_6 <- plot_part_data[594631:700342,]
# 
# plot_ly(plot_part_data_1, x = plot_part_data_1$part, y = plot_part_data_1$value, type = 'box')
# plot_ly(plot_part_data_2, x = plot_part_data_2$part, y = plot_part_data_2$value, type = 'box')
# plot_ly(plot_part_data_3, x = plot_part_data_3$part, y = plot_part_data_3$value, type = 'box')
# plot_ly(plot_part_data_4, x = plot_part_data_4$part, y = plot_part_data_4$value, type = 'box')
# plot_ly(plot_part_data_5, x = plot_part_data_5$part, y = plot_part_data_5$value, type = 'box')
# plot_ly(plot_part_data_6, x = plot_part_data_6$part, y = plot_part_data_6$value, type = 'box')

Greatest inconsistency in classes

Atom

data %>%
  select(res_name, local_res_atom_non_h_count, dict_atom_non_h_count) %>%
  group_by(res_name) %>%
  summarise(atom_inconsistency = mean(abs(local_res_atom_non_h_count - dict_atom_non_h_count))) %>%
  arrange(-atom_inconsistency) %>%
  slice(1:10) %>%
  kable()
res_name atom_inconsistency
PLC 17.1481481
LHG 4.4615385
C8E 2.6428571
NDP 1.7333333
NAP 1.5090909
PG4 1.4225352
MLY 1.2222222
CME 1.0000000
MAN 1.0000000
NAG 0.9949495

Electron

data %>%
  select(res_name, local_res_atom_non_h_electron_sum, dict_atom_non_h_electron_sum) %>%
  group_by(res_name) %>%
  summarise(electron_inconsistency = mean(abs(local_res_atom_non_h_electron_sum - dict_atom_non_h_electron_sum))) %>%
  arrange(-electron_inconsistency) %>%
  slice(1:10) %>%
  kable()
res_name electron_inconsistency
PLC 114.444444
LHG 34.096154
C8E 16.714286
NDP 11.333333
NAP 10.654545
PG4 9.633803
MLY 9.370370
CME 8.000000
MAN 8.000000
NAG 7.959596

Sekcję sprawdzającą czy na podstawie wartości innych kolumn można przewidzieć liczbę elektronów i atomów oraz z jaką dokładnością można dokonać takiej predykcji; trafność regresji powinna zostać oszacowana na podstawie miar R^2 i RMSE;

Linear regression

Electron count

data_partition <- data %>%
  select_if(is.numeric)
  
set.seed(111)

partition <- createDataPartition(y = data_partition$local_res_atom_non_h_electron_sum, p = 0.7, list = FALSE)
data_train <- data %>% 
  slice(partition)
data_test <- data %>%
  slice(-partition)

dim(data_train)
## [1] 4369  350
dim(data_test)
## [1] 1870  350
# set.seed(111)
# fit <- train(local_res_atom_non_h_electron_sum ~ ., data = data_train, method = "lm")
# set.seed(111)
# prediction <- predict(fit, newdata = data_test)
# fit

Atom count